home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Aminet 24
/
Aminet 24 (1998)(GTI - Schatztruhe)[!][Apr 1998].iso
/
Aminet
/
comm
/
mail
/
Mutt089src.lha
/
Mutt-0.89i-AMIGA
/
src
/
rx
/
rxsuper.h
< prev
next >
Wrap
C/C++ Source or Header
|
1998-01-28
|
16KB
|
447 lines
/* classes: h_files */
#ifndef RXSUPERH
#define RXSUPERH
/* Copyright (C) 1995, 1996 Tom Lord
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Library General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public License
* along with this software; see the file COPYING. If not, write to
* the Free Software Foundation, 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/* lord Sun May 7 12:40:17 1995 */
#include "rxnfa.h"
/* This begins the description of the superstate NFA.
*
* The superstate NFA corresponds to the NFA in these ways:
*
* Superstate states correspond to sets of NFA states (nfa_states(SUPER)),
*
* Superstate edges correspond to NFA paths.
*
* The superstate has no epsilon transitions;
* every edge has a character label, and a (possibly empty) side
* effect label. The side effect label corresponds to a list of
* side effects that occur in the NFA. These parts are referred
* to as: superedge_character(EDGE) and superedge_sides(EDGE).
*
* For a superstate edge EDGE starting in some superstate SUPER,
* the following is true (in pseudo-notation :-):
*
* exists DEST in nfa_states s.t.
* exists nfaEDGE in nfa_edges s.t.
* origin (nfaEDGE) == DEST
* && origin (nfaEDGE) is a member of nfa_states(SUPER)
* && exists PF in possible_futures(dest(nfaEDGE)) s.t.
* sides_of_possible_future (PF) == superedge_sides (EDGE)
*
* also:
*
* let SUPER2 := superedge_destination(EDGE)
* nfa_states(SUPER2)
* == union of { all nfa state sets S s.t.
* exists PF in possible_futures(dest(nfaEDGE)) s.t.
* sides_of_possible_future (PF) == superedge_sides (EDGE)
* && S == dests_of_possible_future (PF) }
*
* Or, in English, every superstate is a set of NFA states. A given
* character and a superstate imply many transitions in the NFA --
* those that begin with an edge labeled with that character from a
* state in the set corresponding to the superstate.
*
* The destinations of those transitions each have a set of possible
* futures. A possible future is a list of side effects and a set of
* destination NFA states. Two sets of possible futures can be
* `merged' by combining all pairs of possible futures that have the
* same side effects. A pair is combined by creating a new future
* with the same side effect but the union of the two destination sets.
* In this way, all the possible futures suggested by a superstate
* and a character can be merged into a set of possible futures where
* no two elements of the set have the same set of side effects.
*
* The destination of a possible future, being a set of NFA states,
* corresponds to a supernfa state. So, the merged set of possible
* futures we just created can serve as a set of edges in the
* supernfa.
*
* The representation of the superstate nfa and the nfa is critical.
* The nfa has to be compact, but has to facilitate the rapid
* computation of missing superstates. The superstate nfa has to
* be fast to interpret, lazily constructed, and bounded in space.
*
* To facilitate interpretation, the superstate data structures are
* peppered with `instruction frames'. There is an instruction set
* defined below which matchers using the supernfa must be able to
* interpret.
*
* We'd like to make it possible but not mandatory to use code
* addresses to represent instructions (cf. gcc's computed goto).
* Therefore, we define an enumerated type of opcodes, and when
* writing one of these instructions into a data structure, use
* the opcode as an index into a table of instruction values.
*
* Below are the opcodes that occur in the superstate nfa.
*
* The descriptions of the opcodes refer to data structures that are
* described further below.
*/
/*
 * Opcodes of the superstate-NFA instruction set.
 *
 * The enumerators are numbered consecutively from 0 in declaration
 * order, so an opcode can be used directly as an index into a table of
 * instruction values, and rx_num_instructions equals the number of
 * real opcodes.
 */
enum rx_opcode
{
  /*
   * BACKTRACK_POINT: a character transition out of a superstate leads
   * to more than one edge, so the edges must be explored one at a time
   * with a backtracking strategy.
   *
   * The instruction is placed in a superstate's transition table for a
   * character once that character is known to cross several edges.
   * When the matcher meets it, it saves enough state to be able to
   * return to this point later in the match.
   *
   * Frame data: (struct transition_class *).
   */
  rx_backtrack_point = 0,

  /*
   * RX_DO_SIDE_EFFECTS: evaluate the side effects of an epsilon path.
   * There is one occurrence of this instruction per rx_distinct_future;
   * it is skipped when the rx_distinct_future has no side effects.
   *
   * Frame data: (struct rx_distinct_future *).
   */
  rx_do_side_effects,

  /*
   * RX_CACHE_MISS: recompute a destination superstate.  Stored in
   * rx_distinct_futures whose destination superstate has been reclaimed
   * or was never built, and also in a superstate transition table
   * before any of its edges have been built.
   *
   * Frame data: (struct rx_distinct_future *).
   */
  rx_cache_miss,

  /*
   * RX_NEXT_CHAR: consume the next character and take the matching
   * transition.  This is the only instruction whose argument lives in
   * the DATA field of the instruction frame rather than DATA_2; the
   * comments about rx_inx explain why.
   *
   * Frame data: (struct superstate *).
   */
  rx_next_char,

  /*
   * RX_BACKTRACK: the transition fails.  Not to be confused with
   * rx_backtrack_point.  Carries no data.
   */
  rx_backtrack,

  /*
   * RX_ERROR_INX: stored only in places that should never be
   * executed; it is not supposed to occur.
   */
  rx_error_inx,

  /* Not an instruction: the number of opcodes defined above. */
  rx_num_instructions
};
/* The heart of the matcher is a `word-code-interpreter'
* (like a byte-code interpreter, except that instructions
* are a full word wide).
*
* Instructions are not stored in a vector of code; instead,
* they are scattered throughout the data structures built
* by the regexp compiler and the matcher. One word-code instruction,
* together with the arguments to that instruction, constitutes
* an instruction frame (struct rx_inx).
*
* This structure type is padded by hand to a power of 2 because
* in one of the dominant cases, we dispatch by indexing a table
* of instruction frames. If that indexing can be accomplished
* by just a shift of the index, we're happy.
*
* Instructions take at most one argument, but there are two
* slots in an instruction frame that might hold that argument.
* These are called data and data_2. The data slot is only
* used for one instruction (RX_NEXT_CHAR). For all other
* instructions, data should be set to 0.
*
* RX_NEXT_CHAR is the most important instruction by far.
* By reserving the data field for its exclusive use,
* instruction dispatch is sped up in that case. There is
* no need to fetch both the instruction and the data,
* only the data is needed. In other words, a `cycle' begins
* by fetching the field data. If that is non-0, then it must
* be the destination state of a next_char transition, so
* make that value the current state, advance the match position
* by one character, and start a new cycle. On the other hand,
* if data is 0, fetch the instruction and do a more complicated
* dispatch on that.
*/
/*
 * An instruction frame: one word-code instruction together with its
 * (at most one) argument.  The frame is four pointer-sized slots wide
 * so that a table of frames can be indexed with a shift.
 */
struct rx_inx
{
  /* Argument slot reserved for RX_NEXT_CHAR: when non-0 it is the
   * destination state of a next_char transition.  Every other
   * instruction leaves this slot 0.
   */
  void *data;

  /* Argument slot used by the instructions other than RX_NEXT_CHAR. */
  void *data_2;

  /* The instruction itself; fetched for dispatch only when data is 0. */
  void *inx;

  /* NOTE(review): presumably the hand-added padding that rounds the
   * frame up to a power-of-2 size -- confirm that nothing reads it.
   */
  void *fnord;
};
/*
 * RX_TAIL_ARRAY defaults to 1 unless it was already defined before
 * this point (for example on the compiler command line).
 * NOTE(review): presumably the declared length of trailing arrays in
 * structures defined later in this header -- its uses are not visible
 * here; confirm.
 */
#if !defined (RX_TAIL_ARRAY)
#  define RX_TAIL_ARRAY 1
#endif
/* A superstate corr